We study the CRAN distribution of R packages hosted on GitHub. In particular, we carry out a survival analysis based on the time needed for an R package on GitHub to appear on CRAN.
In [1]:
import pandas
import numpy
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')
cran_release = pandas.read_csv('../data/cran-packages-150601.csv')
data = pandas.read_csv('../data/github-cran-bioc-alldata-150420.csv')
In [2]:
# Keep, for each package, its earliest appearance on GitHub
github = data.query('Source == "github"').sort_values('Date').drop_duplicates('Package').set_index('Package')
github.rename(columns={'Date': 'github'}, inplace=True)
In [3]:
# Keep, for each package, its earliest CRAN release
cran = (cran_release.sort_values('mtime')
        .drop_duplicates('package')
        .rename(columns={'package': 'Package', 'mtime': 'cran'})
        [['Package', 'cran']]
        .set_index('Package'))
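As a quick sanity check of the sort-then-deduplicate pattern used above (a toy frame with made-up values): sorting ascending and dropping duplicates keeps each package's earliest row, because drop_duplicates retains the first occurrence by default.
toy = pandas.DataFrame({'package': ['a', 'a', 'b'],
                        'mtime': ['2014-01-01', '2015-06-01', '2015-01-01']})
print(toy.sort_values('mtime').drop_duplicates('package'))
# 'a' keeps its 2014-01-01 row; 'b' keeps its only row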
In [4]:
# Keep GitHub packages; packages never released on CRAN get NaT in 'cran'
packages = github.join(cran, how='left')
# Fix datetimes
packages['github'] = pandas.to_datetime(packages['github'])
packages['cran'] = pandas.to_datetime(packages['cran'])
# Compute delta & sort (cosmetic)
packages['elapsed'] = packages['cran'] - packages['github']
packages.sort_values('elapsed', inplace=True)
# Convert the delta to days
packages['elapsed'] = packages['elapsed'] / numpy.timedelta64(1, 'D')
# Do GitHub packages have a dependency that is on CRAN?
packages.fillna({'Depends': '', 'Imports': ''}, inplace=True)
def cran_deps(r):
    # Depends/Imports are whitespace-separated package names in this dataset
    return any(p in cran.index
               for p in r['Depends'].split() + r['Imports'].split())
packages['cran_deps'] = packages.apply(cran_deps, axis=1)
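To make the dependency check concrete, here is how cran_deps behaves on a hypothetical row (the row below is made up; the result depends on what is in this CRAN snapshot):
example = pandas.Series({'Depends': 'ggplot2 methods', 'Imports': 'someInternalPkg'})
print(cran_deps(example))
# True as long as at least one listed name (e.g. 'ggplot2') is in cran.index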
In [5]:
ax = packages['elapsed'].plot(kind='hist', bins=50)
ax.set_title('Time needed for a GitHub package to appear on CRAN')
ax.figure.set_size_inches(12, 4)
ax.set_xlabel('Duration in days')
ax.set_ylabel('Number of GitHub R packages')
Out[5]: [histogram: time needed for a GitHub package to appear on CRAN, duration in days vs. number of GitHub R packages]
What are those outliers?
In [6]:
packages.sort_values('elapsed', ascending=False).head(5)[['github', 'cran', 'elapsed']]
Out[6]: [table: the five packages with the largest 'elapsed' values]
Let's prepare the data for a survival analysis.
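As a refresher on right-censoring (toy durations, made-up values): packages that reached CRAN are observed events, while packages still absent from CRAN only give a lower bound on their duration, which the Kaplan-Meier estimator accounts for. A minimal sketch:
import lifelines
toy_kmf = lifelines.KaplanMeierFitter()
# Three packages reached CRAN after 30, 90 and 365 days; two had not yet
# reached CRAN after 400 and 700 days (right-censored)
toy_kmf.fit([30, 90, 365, 400, 700], event_observed=[True, True, True, False, False])
print(toy_kmf.survival_function_)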
In [7]:
import lifelines
In [8]:
survival = packages.copy()
# Remove packages that were first on CRAN: keep those whose CRAN date is
# missing (not yet released) or strictly later than their GitHub date
survival = survival[survival['cran'].isnull() | (survival['github'] < survival['cran'])].copy()
# Observed packages are those that did reach CRAN
survival['observed'] = survival['elapsed'].notnull()
# Censored packages (NaN elapsed) get the duration from GitHub until now
now = pandas.Timestamp.now()
survival['elapsed'] = survival['elapsed'].fillna((now - survival['github']) / numpy.timedelta64(1, 'D'))
print(len(packages), len(survival), len(survival[survival['observed']]))
In [9]:
kmf = lifelines.KaplanMeierFitter()
plot_groups = [
    [
        {'label': 'All', 'df': survival},
        {'label': 'Without outliers', 'df': survival.query('observed == False or elapsed < 1200')},
    ], [
        # Crude split on the first character of the version string
        {'label': 'Version >= 1', 'df': survival[survival['Version'].str[0] >= '1']},
        {'label': 'Version < 1', 'df': survival[survival['Version'].str[0] < '1']},
    ], [
        {'label': 'With CRAN dependencies', 'df': survival.query('cran_deps == True')},
        {'label': 'Without CRAN dependencies', 'df': survival.query('cran_deps == False')},
    ],
]
for i, group in enumerate(plot_groups):
    ax = plt.subplot(len(plot_groups), 1, i + 1)
    for cond in group:
        print('{}: {} items, {} observed'.format(
            cond['label'], len(cond['df']), len(cond['df'].query('observed == True'))))
        kmf.fit(cond['df']['elapsed'], event_observed=cond['df']['observed'], label=cond['label'])
        ax = kmf.plot(ax=ax)
    ax.set_title('Kaplan-Meier estimation of the time needed for a GitHub package to appear on CRAN')
ax.figure.set_size_inches(12, 8)
ax.set_xlabel('Duration (in days)')
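As a possible follow-up (assuming a recent lifelines release, where the fitter exposes a median_survival_time_ attribute), one could read off the estimated median time to CRAN:
# Duration by which half of the estimated population has reached CRAN;
# infinite if the survival curve never drops below 0.5
kmf.fit(survival['elapsed'], event_observed=survival['observed'], label='All')
print(kmf.median_survival_time_)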